library(dplyr)
library(vroom)
library(here)
library(skimr)
library(tidyverse)
library(tidyquant)
library(mosaic)
library(GGally)
library(ggplot2)
library(readr)
library(leaflet)
library(ggfortify)
library(janitor)
library(broom)
library(car)
library(huxtable)
library(flextable)
library(kableExtra)
library(ggthemes)
library(sjPlot)
library(sjmisc)
library(sjlabelled)
library(ggcorrplot)## Rows: 7,323
## Columns: 106
## $ id <dbl> 49091, 50646, 56334, 716…
## $ listing_url <chr> "https://www.airbnb.com/…
## $ scrape_id <dbl> 2.02e+13, 2.02e+13, 2.02…
## $ last_scraped <date> 2020-06-22, 2020-06-22,…
## $ name <chr> "COZICOMFORT LONG TERM S…
## $ summary <chr> NA, "Fully furnished bed…
## $ space <chr> "This is Room No. 2.(ava…
## $ description <chr> "This is Room No. 2.(ava…
## $ experiences_offered <chr> "none", "none", "none", …
## $ neighborhood_overview <chr> NA, "The serenity & quie…
## $ notes <chr> NA, "Accommodation has a…
## $ transit <chr> NA, "Less than 400m from…
## $ access <chr> NA, "Kitchen, washing fa…
## $ interaction <chr> NA, "We love to host peo…
## $ house_rules <chr> "No smoking indoors. Ple…
## $ thumbnail_url <lgl> NA, NA, NA, NA, NA, NA, …
## $ medium_url <lgl> NA, NA, NA, NA, NA, NA, …
## $ picture_url <chr> "https://a0.muscache.com…
## $ xl_picture_url <lgl> NA, NA, NA, NA, NA, NA, …
## $ host_id <dbl> 266763, 227796, 266763, …
## $ host_url <chr> "https://www.airbnb.com/…
## $ host_name <chr> "Francesca", "Sujatha", …
## $ host_since <date> 2010-10-20, 2010-09-08,…
## $ host_location <chr> "singapore", "Singapore,…
## $ host_about <chr> "I am a private tutor by…
## $ host_response_time <chr> "within an hour", "N/A",…
## $ host_response_rate <chr> "100%", "N/A", "100%", "…
## $ host_acceptance_rate <chr> "N/A", "N/A", "N/A", "10…
## $ host_is_superhost <lgl> FALSE, FALSE, FALSE, FAL…
## $ host_thumbnail_url <chr> "https://a0.muscache.com…
## $ host_picture_url <chr> "https://a0.muscache.com…
## $ host_neighbourhood <chr> "Woodlands", "Bukit Tima…
## $ host_listings_count <dbl> 2, 1, 2, 8, 8, 8, 8, 4, …
## $ host_total_listings_count <dbl> 2, 1, 2, 8, 8, 8, 8, 4, …
## $ host_verifications <chr> "['email', 'phone', 'fac…
## $ host_has_profile_pic <lgl> TRUE, TRUE, TRUE, TRUE, …
## $ host_identity_verified <lgl> FALSE, FALSE, FALSE, TRU…
## $ street <chr> "Singapore, Singapore", …
## $ neighbourhood <chr> "Woodlands", "Bukit Tima…
## $ neighbourhood_cleansed <chr> "Woodlands", "Bukit Tima…
## $ neighbourhood_group_cleansed <chr> "North Region", "Central…
## $ city <chr> "Singapore", "Singapore"…
## $ state <chr> NA, NA, NA, NA, NA, NA, …
## $ zipcode <chr> "730702", "589664", NA, …
## $ market <chr> "Singapore", "Singapore"…
## $ smart_location <chr> "Singapore", "Singapore"…
## $ country_code <chr> "SG", "SG", "SG", "SG", …
## $ country <chr> "Singapore", "Singapore"…
## $ latitude <dbl> 1.44, 1.33, 1.44, 1.35, …
## $ longitude <dbl> 104, 104, 104, 104, 104,…
## $ is_location_exact <lgl> TRUE, TRUE, TRUE, TRUE, …
## $ property_type <chr> "Apartment", "Apartment"…
## $ room_type <chr> "Private room", "Private…
## $ accommodates <dbl> 1, 2, 1, 6, 3, 3, 6, 1, …
## $ bathrooms <dbl> 1.0, 1.0, 1.0, 1.0, 0.5,…
## $ bedrooms <dbl> 1, 1, 1, 2, 1, 1, 1, 1, …
## $ beds <dbl> 1, 1, 1, 3, 1, 2, 7, 1, …
## $ bed_type <chr> "Real Bed", "Real Bed", …
## $ amenities <chr> "{TV,\"Cable TV\",Intern…
## $ square_feet <dbl> 0, NA, 0, 205, NA, NA, 4…
## $ price <chr> "$84.00", "$80.00", "$70…
## $ weekly_price <chr> NA, "$400.00", NA, NA, "…
## $ monthly_price <chr> "$1,048.00", "$1,600.00"…
## $ security_deposit <chr> NA, NA, NA, "$279.00", "…
## $ cleaning_fee <chr> NA, NA, NA, "$56.00", "$…
## $ guests_included <dbl> 1, 2, 1, 4, 1, 1, 4, 1, …
## $ extra_people <chr> "$14.00", "$20.00", "$14…
## $ minimum_nights <dbl> 180, 90, 6, 90, 90, 90, …
## $ maximum_nights <dbl> 360, 730, 14, 1125, 1125…
## $ minimum_minimum_nights <dbl> 180, 90, 6, 90, 90, 90, …
## $ maximum_minimum_nights <dbl> 180, 90, 6, 90, 90, 90, …
## $ minimum_maximum_nights <dbl> 360, 730, 14, 1125, 1125…
## $ maximum_maximum_nights <dbl> 360, 730, 14, 1125, 1125…
## $ minimum_nights_avg_ntm <dbl> 180, 90, 6, 90, 90, 90, …
## $ maximum_nights_avg_ntm <dbl> 360, 730, 14, 1125, 1125…
## $ calendar_updated <chr> "73 months ago", "71 mon…
## $ has_availability <lgl> TRUE, TRUE, TRUE, TRUE, …
## $ availability_30 <dbl> 30, 30, 30, 30, 30, 30, …
## $ availability_60 <dbl> 60, 60, 60, 60, 60, 60, …
## $ availability_90 <dbl> 90, 90, 90, 90, 90, 90, …
## $ availability_365 <dbl> 365, 365, 365, 365, 365,…
## $ calendar_last_scraped <date> 2020-06-22, 2020-06-22,…
## $ number_of_reviews <dbl> 1, 18, 20, 20, 24, 48, 2…
## $ number_of_reviews_ltm <dbl> 0, 0, 0, 8, 4, 13, 6, 2,…
## $ first_review <date> 2013-10-21, 2014-04-18,…
## $ last_review <date> 2013-10-21, 2014-12-26,…
## $ review_scores_rating <dbl> 94, 91, 98, 89, 83, 88, …
## $ review_scores_accuracy <dbl> 10, 9, 10, 9, 8, 9, 9, 1…
## $ review_scores_cleanliness <dbl> 10, 10, 10, 8, 8, 9, 8, …
## $ review_scores_checkin <dbl> 10, 10, 10, 9, 9, 9, 9, …
## $ review_scores_communication <dbl> 10, 10, 10, 10, 9, 9, 9,…
## $ review_scores_location <dbl> 8, 9, 8, 9, 8, 9, 9, 10,…
## $ review_scores_value <dbl> 8, 9, 9, 9, 8, 9, 8, 10,…
## $ requires_license <lgl> FALSE, FALSE, FALSE, FAL…
## $ license <lgl> NA, NA, NA, NA, NA, NA, …
## $ jurisdiction_names <lgl> NA, NA, NA, NA, NA, NA, …
## $ instant_bookable <lgl> FALSE, FALSE, FALSE, TRU…
## $ is_business_travel_ready <lgl> FALSE, FALSE, FALSE, FAL…
## $ cancellation_policy <chr> "flexible", "moderate", …
## $ require_guest_profile_picture <lgl> TRUE, FALSE, TRUE, FALSE…
## $ require_guest_phone_verification <lgl> TRUE, TRUE, TRUE, TRUE, …
## $ calculated_host_listings_count <dbl> 2, 1, 2, 8, 8, 8, 8, 3, …
## $ calculated_host_listings_count_entire_homes <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ calculated_host_listings_count_private_rooms <dbl> 2, 1, 2, 8, 8, 8, 8, 3, …
## $ calculated_host_listings_count_shared_rooms <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ reviews_per_month <dbl> 0.01, 0.24, 0.18, 0.19, …
We can observe that there are 106 variables/columns, with 7,323 observations/rows.
The data we should coerce into different types are as below: 1) price 2) weekly_price 3) monthly_price 4) security_deposit 5) cleaning_fee 6) extra_people
Let’s convert these characters to numeric.
# Transforming characters into numbers
listings <- listings %>%
mutate(price = parse_number(price)) %>%
mutate(weekly_price = parse_number(weekly_price)) %>%
mutate(monthly_price = parse_number(monthly_price)) %>%
mutate(security_deposit = parse_number(security_deposit)) %>%
mutate(cleaning_fee = parse_number(cleaning_fee)) %>%
mutate(extra_people = parse_number(extra_people))
# Check if coercion was successful
typeof(listings$price)## [1] "double"
## [1] "double"
| Name | listings |
| Number of rows | 7323 |
| Number of columns | 106 |
| _______________________ | |
| Column type frequency: | |
| character | 41 |
| Date | 5 |
| logical | 15 |
| numeric | 45 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| listing_url | 0 | 1.00 | 34 | 37 | 0 | 7323 | 0 |
| name | 1 | 1.00 | 1 | 92 | 0 | 6766 | 0 |
| summary | 324 | 0.96 | 1 | 1000 | 0 | 4367 | 0 |
| space | 1957 | 0.73 | 1 | 1000 | 0 | 3139 | 0 |
| description | 249 | 0.97 | 1 | 1000 | 0 | 5180 | 0 |
| experiences_offered | 0 | 1.00 | 4 | 4 | 0 | 1 | 0 |
| neighborhood_overview | 2933 | 0.60 | 3 | 1000 | 0 | 2137 | 0 |
| notes | 3315 | 0.55 | 1 | 1000 | 0 | 1634 | 0 |
| transit | 2895 | 0.60 | 1 | 1000 | 0 | 2212 | 0 |
| access | 2821 | 0.61 | 1 | 1000 | 0 | 2002 | 0 |
| interaction | 3250 | 0.56 | 1 | 1000 | 0 | 1665 | 0 |
| house_rules | 3874 | 0.47 | 1 | 1000 | 0 | 2059 | 0 |
| picture_url | 0 | 1.00 | 81 | 146 | 0 | 6777 | 0 |
| host_url | 0 | 1.00 | 39 | 43 | 0 | 2466 | 0 |
| host_name | 22 | 1.00 | 1 | 35 | 0 | 1739 | 0 |
| host_location | 41 | 0.99 | 2 | 183 | 0 | 217 | 0 |
| host_about | 2441 | 0.67 | 1 | 2858 | 0 | 1174 | 1 |
| host_response_time | 22 | 1.00 | 3 | 18 | 0 | 5 | 0 |
| host_response_rate | 22 | 1.00 | 2 | 4 | 0 | 56 | 0 |
| host_acceptance_rate | 22 | 1.00 | 2 | 4 | 0 | 79 | 0 |
| host_thumbnail_url | 22 | 1.00 | 55 | 106 | 0 | 2448 | 0 |
| host_picture_url | 22 | 1.00 | 57 | 109 | 0 | 2448 | 0 |
| host_neighbourhood | 842 | 0.89 | 4 | 18 | 0 | 62 | 0 |
| host_verifications | 0 | 1.00 | 2 | 158 | 0 | 187 | 0 |
| street | 0 | 1.00 | 13 | 61 | 0 | 93 | 0 |
| neighbourhood | 2 | 1.00 | 5 | 18 | 0 | 45 | 0 |
| neighbourhood_cleansed | 0 | 1.00 | 4 | 23 | 0 | 43 | 0 |
| neighbourhood_group_cleansed | 0 | 1.00 | 11 | 17 | 0 | 5 | 0 |
| city | 64 | 0.99 | 1 | 20 | 0 | 39 | 0 |
| state | 6817 | 0.07 | 1 | 39 | 0 | 50 | 0 |
| zipcode | 818 | 0.89 | 1 | 9 | 0 | 1975 | 0 |
| market | 90 | 0.99 | 9 | 11 | 0 | 2 | 0 |
| smart_location | 0 | 1.00 | 9 | 31 | 0 | 43 | 0 |
| country_code | 0 | 1.00 | 2 | 2 | 0 | 1 | 0 |
| country | 0 | 1.00 | 9 | 9 | 0 | 1 | 0 |
| property_type | 0 | 1.00 | 3 | 22 | 0 | 26 | 0 |
| room_type | 0 | 1.00 | 10 | 15 | 0 | 4 | 0 |
| bed_type | 0 | 1.00 | 5 | 13 | 0 | 5 | 0 |
| amenities | 0 | 1.00 | 2 | 977 | 0 | 5621 | 0 |
| calendar_updated | 0 | 1.00 | 5 | 13 | 0 | 79 | 0 |
| cancellation_policy | 0 | 1.00 | 8 | 27 | 0 | 5 | 0 |
Variable type: Date
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| last_scraped | 0 | 1.00 | 2020-06-22 | 2020-06-23 | 2020-06-22 | 2 |
| host_since | 22 | 1.00 | 2009-06-29 | 2020-06-14 | 2016-04-09 | 1576 |
| calendar_last_scraped | 0 | 1.00 | 2020-06-22 | 2020-06-23 | 2020-06-22 | 2 |
| first_review | 2835 | 0.61 | 2011-05-04 | 2020-06-22 | 2018-08-10 | 1730 |
| last_review | 2835 | 0.61 | 2013-10-21 | 2020-06-22 | 2019-12-25 | 1158 |
Variable type: logical
| skim_variable | n_missing | complete_rate | mean | count |
|---|---|---|---|---|
| thumbnail_url | 7323 | 0 | NaN | : |
| medium_url | 7323 | 0 | NaN | : |
| xl_picture_url | 7323 | 0 | NaN | : |
| host_is_superhost | 22 | 1 | 0.16 | FAL: 6143, TRU: 1158 |
| host_has_profile_pic | 22 | 1 | 1.00 | TRU: 7282, FAL: 19 |
| host_identity_verified | 22 | 1 | 0.22 | FAL: 5668, TRU: 1633 |
| is_location_exact | 0 | 1 | 0.80 | TRU: 5845, FAL: 1478 |
| has_availability | 0 | 1 | 1.00 | TRU: 7323 |
| requires_license | 0 | 1 | 0.00 | FAL: 7323 |
| license | 7323 | 0 | NaN | : |
| jurisdiction_names | 7323 | 0 | NaN | : |
| instant_bookable | 0 | 1 | 0.42 | FAL: 4227, TRU: 3096 |
| is_business_travel_ready | 0 | 1 | 0.00 | FAL: 7323 |
| require_guest_profile_picture | 0 | 1 | 0.00 | FAL: 7289, TRU: 34 |
| require_guest_phone_verification | 0 | 1 | 0.01 | FAL: 7276, TRU: 47 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| id | 0 | 1.00 | 2.69e+07 | 1.22e+07 | 4.91e+04 | 1.70e+07 | 2.90e+07 | 3.83e+07 | 4.39e+07 | ▂▃▅▆▇ |
| scrape_id | 0 | 1.00 | 2.02e+13 | 0.00e+00 | 2.02e+13 | 2.02e+13 | 2.02e+13 | 2.02e+13 | 2.02e+13 | ▁▁▇▁▁ |
| host_id | 0 | 1.00 | 1.06e+08 | 9.58e+07 | 2.37e+04 | 2.54e+07 | 6.64e+07 | 1.74e+08 | 3.50e+08 | ▇▂▂▂▁ |
| host_listings_count | 22 | 1.00 | 4.96e+01 | 9.04e+01 | 0.00e+00 | 1.00e+00 | 9.00e+00 | 5.10e+01 | 3.92e+02 | ▇▁▁▁▁ |
| host_total_listings_count | 22 | 1.00 | 4.96e+01 | 9.04e+01 | 0.00e+00 | 1.00e+00 | 9.00e+00 | 5.10e+01 | 3.92e+02 | ▇▁▁▁▁ |
| latitude | 0 | 1.00 | 1.31e+00 | 3.00e-02 | 1.24e+00 | 1.30e+00 | 1.31e+00 | 1.32e+00 | 1.45e+00 | ▂▇▂▁▁ |
| longitude | 0 | 1.00 | 1.04e+02 | 4.00e-02 | 1.04e+02 | 1.04e+02 | 1.04e+02 | 1.04e+02 | 1.04e+02 | ▁▁▃▇▁ |
| accommodates | 0 | 1.00 | 3.25e+00 | 2.51e+00 | 1.00e+00 | 2.00e+00 | 2.00e+00 | 4.00e+00 | 1.60e+01 | ▇▂▁▁▁ |
| bathrooms | 3 | 1.00 | 1.54e+00 | 1.16e+00 | 0.00e+00 | 1.00e+00 | 1.00e+00 | 2.00e+00 | 2.10e+01 | ▇▁▁▁▁ |
| bedrooms | 12 | 1.00 | 1.31e+00 | 8.70e-01 | 0.00e+00 | 1.00e+00 | 1.00e+00 | 2.00e+00 | 1.00e+01 | ▇▁▁▁▁ |
| beds | 71 | 0.99 | 1.98e+00 | 2.52e+00 | 0.00e+00 | 1.00e+00 | 1.00e+00 | 2.00e+00 | 5.80e+01 | ▇▁▁▁▁ |
| square_feet | 7292 | 0.00 | 2.05e+02 | 2.46e+02 | 0.00e+00 | 0.00e+00 | 2.00e+02 | 3.18e+02 | 1.00e+03 | ▇▃▁▁▁ |
| price | 0 | 1.00 | 1.76e+02 | 4.83e+02 | 1.40e+01 | 6.30e+01 | 1.24e+02 | 1.90e+02 | 1.30e+04 | ▇▁▁▁▁ |
| weekly_price | 6857 | 0.06 | 8.36e+02 | 9.80e+02 | 1.00e+02 | 3.99e+02 | 5.70e+02 | 1.00e+03 | 1.47e+04 | ▇▁▁▁▁ |
| monthly_price | 6826 | 0.07 | 2.90e+03 | 3.51e+03 | 4.50e+02 | 1.30e+03 | 2.10e+03 | 3.60e+03 | 5.50e+04 | ▇▁▁▁▁ |
| security_deposit | 2217 | 0.70 | 2.31e+02 | 5.25e+02 | 0.00e+00 | 0.00e+00 | 1.50e+02 | 2.50e+02 | 6.99e+03 | ▇▁▁▁▁ |
| cleaning_fee | 1947 | 0.73 | 3.89e+01 | 4.29e+01 | 0.00e+00 | 1.50e+01 | 3.00e+01 | 5.00e+01 | 9.00e+02 | ▇▁▁▁▁ |
| guests_included | 0 | 1.00 | 1.87e+00 | 1.60e+00 | 1.00e+00 | 1.00e+00 | 1.00e+00 | 2.00e+00 | 5.00e+01 | ▇▁▁▁▁ |
| extra_people | 0 | 1.00 | 1.47e+01 | 2.16e+01 | 0.00e+00 | 0.00e+00 | 1.00e+01 | 2.00e+01 | 4.04e+02 | ▇▁▁▁▁ |
| minimum_nights | 0 | 1.00 | 2.07e+01 | 4.69e+01 | 1.00e+00 | 2.00e+00 | 4.00e+00 | 1.80e+01 | 1.00e+03 | ▇▁▁▁▁ |
| maximum_nights | 0 | 1.00 | 9.27e+02 | 1.25e+03 | 1.00e+00 | 1.12e+03 | 1.12e+03 | 1.12e+03 | 1.00e+05 | ▇▁▁▁▁ |
| minimum_minimum_nights | 0 | 1.00 | 2.01e+01 | 4.35e+01 | 1.00e+00 | 1.00e+00 | 4.00e+00 | 1.80e+01 | 1.00e+03 | ▇▁▁▁▁ |
| maximum_minimum_nights | 0 | 1.00 | 2.07e+01 | 4.42e+01 | 1.00e+00 | 2.00e+00 | 5.00e+00 | 1.80e+01 | 1.00e+03 | ▇▁▁▁▁ |
| minimum_maximum_nights | 0 | 1.00 | 8.81e+05 | 4.35e+07 | 1.00e+00 | 1.12e+03 | 1.12e+03 | 1.12e+03 | 2.15e+09 | ▇▁▁▁▁ |
| maximum_maximum_nights | 0 | 1.00 | 1.17e+06 | 5.02e+07 | 1.00e+00 | 1.12e+03 | 1.12e+03 | 1.12e+03 | 2.15e+09 | ▇▁▁▁▁ |
| minimum_nights_avg_ntm | 0 | 1.00 | 2.03e+01 | 4.37e+01 | 1.00e+00 | 2.00e+00 | 4.00e+00 | 1.80e+01 | 1.00e+03 | ▇▁▁▁▁ |
| maximum_nights_avg_ntm | 0 | 1.00 | 1.17e+06 | 5.02e+07 | 1.00e+00 | 1.12e+03 | 1.12e+03 | 1.12e+03 | 2.15e+09 | ▇▁▁▁▁ |
| availability_30 | 0 | 1.00 | 1.58e+01 | 1.35e+01 | 0.00e+00 | 0.00e+00 | 2.10e+01 | 2.90e+01 | 3.00e+01 | ▇▁▁▂▇ |
| availability_60 | 0 | 1.00 | 3.43e+01 | 2.69e+01 | 0.00e+00 | 0.00e+00 | 5.10e+01 | 5.90e+01 | 6.00e+01 | ▆▁▁▁▇ |
| availability_90 | 0 | 1.00 | 5.33e+01 | 4.01e+01 | 0.00e+00 | 0.00e+00 | 8.10e+01 | 8.90e+01 | 9.00e+01 | ▅▁▁▁▇ |
| availability_365 | 0 | 1.00 | 1.95e+02 | 1.59e+02 | 0.00e+00 | 1.00e+00 | 1.81e+02 | 3.63e+02 | 3.65e+02 | ▆▂▂▁▇ |
| number_of_reviews | 0 | 1.00 | 1.25e+01 | 3.16e+01 | 0.00e+00 | 0.00e+00 | 1.00e+00 | 8.00e+00 | 3.70e+02 | ▇▁▁▁▁ |
| number_of_reviews_ltm | 0 | 1.00 | 3.79e+00 | 9.27e+00 | 0.00e+00 | 0.00e+00 | 0.00e+00 | 2.00e+00 | 1.39e+02 | ▇▁▁▁▁ |
| review_scores_rating | 2969 | 0.59 | 9.06e+01 | 1.25e+01 | 2.00e+01 | 8.70e+01 | 9.40e+01 | 1.00e+02 | 1.00e+02 | ▁▁▁▂▇ |
| review_scores_accuracy | 2974 | 0.59 | 9.23e+00 | 1.28e+00 | 2.00e+00 | 9.00e+00 | 1.00e+01 | 1.00e+01 | 1.00e+01 | ▁▁▁▁▇ |
| review_scores_cleanliness | 2972 | 0.59 | 8.99e+00 | 1.36e+00 | 2.00e+00 | 9.00e+00 | 9.00e+00 | 1.00e+01 | 1.00e+01 | ▁▁▁▂▇ |
| review_scores_checkin | 2978 | 0.59 | 9.51e+00 | 1.07e+00 | 2.00e+00 | 9.00e+00 | 1.00e+01 | 1.00e+01 | 1.00e+01 | ▁▁▁▁▇ |
| review_scores_communication | 2974 | 0.59 | 9.49e+00 | 1.11e+00 | 2.00e+00 | 9.00e+00 | 1.00e+01 | 1.00e+01 | 1.00e+01 | ▁▁▁▁▇ |
| review_scores_location | 2979 | 0.59 | 9.38e+00 | 1.04e+00 | 2.00e+00 | 9.00e+00 | 1.00e+01 | 1.00e+01 | 1.00e+01 | ▁▁▁▁▇ |
| review_scores_value | 2978 | 0.59 | 8.96e+00 | 1.27e+00 | 2.00e+00 | 9.00e+00 | 9.00e+00 | 1.00e+01 | 1.00e+01 | ▁▁▁▂▇ |
| calculated_host_listings_count | 0 | 1.00 | 4.37e+01 | 7.52e+01 | 1.00e+00 | 2.00e+00 | 1.00e+01 | 5.20e+01 | 3.35e+02 | ▇▂▁▁▁ |
| calculated_host_listings_count_entire_homes | 0 | 1.00 | 3.18e+01 | 7.26e+01 | 0.00e+00 | 0.00e+00 | 1.00e+00 | 2.30e+01 | 3.30e+02 | ▇▁▁▁▁ |
| calculated_host_listings_count_private_rooms | 0 | 1.00 | 8.90e+00 | 2.47e+01 | 0.00e+00 | 0.00e+00 | 1.00e+00 | 4.00e+00 | 1.36e+02 | ▇▁▁▁▁ |
| calculated_host_listings_count_shared_rooms | 0 | 1.00 | 4.50e-01 | 2.11e+00 | 0.00e+00 | 0.00e+00 | 0.00e+00 | 0.00e+00 | 2.00e+01 | ▇▁▁▁▁ |
| reviews_per_month | 2835 | 0.61 | 7.60e-01 | 1.33e+00 | 1.00e-02 | 1.10e-01 | 3.00e-01 | 8.70e-01 | 2.87e+01 | ▇▁▁▁▁ |
Let’s change NAs to meaningful entries. For example missing security_deposit or cleaning_fee can very well mean they are zero. Both of their completion rates are about 70%.
# Handling missing values
listings <- listings %>%
mutate(cleaning_fee = case_when(
is.na(cleaning_fee) ~ 0,
TRUE ~ cleaning_fee
)) %>%
mutate(security_deposit = case_when(
is.na(security_deposit) ~ 0,
TRUE ~ security_deposit
))Count distinct property types
# Count and calculate proportion
type <- listings %>%
count(property_type) %>%
arrange(desc(n)) %>%
mutate(proportion = 100 * n/sum(n))
type %>%
kable()| property_type | n | proportion |
|---|---|---|
| Apartment | 2882 | 39.355 |
| Condominium | 2136 | 29.168 |
| Serviced apartment | 657 | 8.972 |
| House | 501 | 6.841 |
| Hostel | 319 | 4.356 |
| Hotel | 170 | 2.321 |
| Boutique hotel | 155 | 2.117 |
| Townhouse | 112 | 1.529 |
| Loft | 105 | 1.434 |
| Bed and breakfast | 76 | 1.038 |
| Bungalow | 66 | 0.901 |
| Other | 57 | 0.778 |
| Guest suite | 18 | 0.246 |
| Guesthouse | 17 | 0.232 |
| Villa | 16 | 0.218 |
| Aparthotel | 10 | 0.137 |
| Boat | 7 | 0.096 |
| Tent | 6 | 0.082 |
| Campsite | 3 | 0.041 |
| Chalet | 3 | 0.041 |
| Tiny house | 2 | 0.027 |
| Bus | 1 | 0.014 |
| Cabin | 1 | 0.014 |
| Earth house | 1 | 0.014 |
| Heritage hotel (India) | 1 | 0.014 |
| Igloo | 1 | 0.014 |
## [1] 84.3
We need to add an extra column prop_type_simplified with just 5 categories.
# Create an extra column with simplified
listings <- listings %>%
mutate(prop_type_simplified = case_when(
property_type %in% c("Apartment","Condominium", "Serviced apartment","House") ~ property_type,
TRUE ~ "Other"
))After the above operation, a skim should find out that prop_type_simplified has only 5 categories, much more simplified compared to property_type having 26 categories. The check below (provided by the project instruction website) also works.
# Check if prop_type_simplified is correctly created
listings %>%
count(prop_type_simplified) %>%
arrange(desc(n)) %>%
kable()| prop_type_simplified | n |
|---|---|
| Apartment | 2882 |
| Condominium | 2136 |
| Other | 1147 |
| Serviced apartment | 657 |
| House | 501 |
As we are only concerned with listings for travel purpose. We filter data such that we only have observations with minimum_nights <= 4
# Filter to obtain listings for travel purpose
listings1 <- listings %>%
filter(minimum_nights <= 4)
# Check if we successfully filtered
fav_stats(listings1$minimum_nights)| min | Q1 | median | Q3 | max | mean | sd | n | missing |
|---|---|---|---|---|---|---|---|---|
| 1 | 1 | 2 | 3 | 4 | 1.87 | 0.946 | 3713 | 0 |
# Does guest_included push up price?
p_v_guest <- listings1 %>%
ggplot(mapping = aes(x = guests_included, y = price)) +
geom_point(alpha = 0.5, colour = "red") +
xlab("guests included")
p_v_guestp_v_cleaning <- listings1 %>%
ggplot(listings1, mapping = aes(x = cleaning_fee, y = price)) +
geom_point(alpha = 0.5, colour = "blue") +
xlab("cleaning fee")
p_v_cleaningIt looks like both graphs look too clustered near the origin point. There are also outliers on x-axes with orders of magnitudes higher than points near origin. Let’s apply log() to both axes and see how.
p_v_guest_log <- listings1 %>%
ggplot(mapping = aes(x = log(guests_included), y = log(price))) +
geom_point(alpha = 0.5, colour = "red") +
geom_smooth() +
xlab("log(guests included)")
p_v_guest_logp_v_cleaning_log <- listings1 %>%
ggplot(listings1, mapping = aes(x = log(cleaning_fee), y = log(price))) +
geom_point(alpha = 0.5, colour = "blue") +
geom_smooth() +
xlab("log(cleaning fee)")
p_v_cleaning_logDeduction: It is interesting to see that there is a positive correlation in both cases, meaning certain proportional increases in guests included/cleaning fee can be associated with proportional increases in prices.
leaflet(data = listings1) %>%
addProviderTiles("OpenStreetMap.Mapnik") %>%
addCircleMarkers(lng = ~longitude,
lat = ~latitude,
radius = 1,
fillColor = "blue",
fillOpacity = 0.4,
popup = ~listing_url,
label = ~property_type)To get a sense of the zoning and the locations of each simplified property type, we change leaflet operations a bit as follows:
# See the unique levels of prop_type_simplified and neighbourhood_group_cleansed
unique(listings1$neighbourhood_group_cleansed)## [1] "East Region" "North-East Region" "West Region"
## [4] "Central Region" "North Region"
## [1] "House" "Condominium" "Apartment"
## [4] "Other" "Serviced apartment"
# Count each unique levels under prop_type_simplified
n_c_count <- listings1 %>%
count(neighbourhood_group_cleansed) %>%
arrange(desc(n)) %>%
mutate(total = sum(n)) %>%
mutate(proportion = 100 * n/sum(n))
n_c_count| neighbourhood_group_cleansed | n | total | proportion |
|---|---|---|---|
| Central Region | 2868 | 3713 | 77.2 |
| East Region | 293 | 3713 | 7.89 |
| West Region | 286 | 3713 | 7.7 |
| North-East Region | 166 | 3713 | 4.47 |
| North Region | 100 | 3713 | 2.69 |
# Create a palette that maps neighbourhood_group_cleansed factor levels to colors
pal1 <- colorFactor(c("navy", "red", "yellow", "orange", "green"),
domain = c("Central Region", "East Region", "West Region","North-East Region", "North Region" ))
# Apply the palette to actual data
leaflet(data = listings1) %>%
addTiles() %>%
addCircleMarkers(
lng = ~longitude,
lat = ~latitude,
color = ~pal1(neighbourhood_group_cleansed),
radius = 3,
stroke = FALSE,
fillOpacity = 0.5)We speculate that most AirBnb customers could be tourists of other countries rather than locals. From the cluster mapping and count table, Central Region is most concentrated in listings, consistent with the intuition that Central Region has some of the hottest sightseeing (or meeting spots for businessmen). East Region is close to Changi Airport, a top international transfer choice and one of the busiest airport in the world. The associated demand could impact price. Zoning can well be an independent variable that contributes to pricing.
# Count each unique levels under prop_type_simplified
s_p_count <- listings1 %>%
count(prop_type_simplified) %>%
arrange(desc(n)) %>%
mutate(total = sum(n)) %>%
mutate(proportion = 100 * n/sum(n))
s_p_count| prop_type_simplified | n | total | proportion |
|---|---|---|---|
| Apartment | 1448 | 3713 | 39 |
| Other | 953 | 3713 | 25.7 |
| Condominium | 803 | 3713 | 21.6 |
| House | 310 | 3713 | 8.35 |
| Serviced apartment | 199 | 3713 | 5.36 |
# Create a palette that maps prop_type_simplified factor levels to colors
pal2 <- colorFactor(c("navy", "red", "yellow", "black", "green"),
domain = c("Apartment", "Other", "Condominium", "House", "Serviced apartment"))
# Apply the palette to actual data
leaflet(data = listings1) %>%
addTiles() %>%
addCircleMarkers(
lng = ~longitude,
lat = ~latitude,
color = ~pal2(prop_type_simplified),
radius = 3,
stroke = FALSE,
fillOpacity = 0.5)From both count and mapping, we can see that Apartment is the most widely offered property type. Given its geographical span, it could well mean prices are quite dispersed within the category. House and Serivced aparment are most clustered in the Central Region. We speculate that the ‘prop_type_simplified’ could enhance a likely high price that comes with its region identity.
price_4_nights or price_4_nights_log# Create a new variable called price_4_nights
extra_listings <- listings1 %>%
filter(maximum_nights >= 4) %>% # The listing must accept at least 4 nights of stay
mutate(price_4_nights =
ifelse(guests_included <= 1, # If only meant to accommodate 1 guest
(price + extra_people) * 4 + cleaning_fee, # Need extra_people charge
(price) * 4 + cleaning_fee)) # Price if can accommodate 2 or more guests
# See key stats of price for 2 people 4 nights in Singapore
fav_stats(extra_listings$price_4_nights)| min | Q1 | median | Q3 | max | mean | sd | n | missing |
|---|---|---|---|---|---|---|---|---|
| 56 | 280 | 494 | 815 | 5.2e+04 | 814 | 2.5e+03 | 3654 | 0 |
##
## Call:
## density.default(x = extra_listings$price_4_nights)
##
## Data: extra_listings$price_4_nights (3654 obs.); Bandwidth 'bw' = 69.65
##
## x y
## Min. : -153 Min. :0.000000
## 1st Qu.:12938 1st Qu.:0.000000
## Median :26028 Median :0.000000
## Mean :26028 Mean :0.000019
## 3rd Qu.:39118 3rd Qu.:0.000000
## Max. :52209 Max. :0.001361
ggplot(extra_listings,
aes(x = price_4_nights)) +
geom_density()+
theme_economist() +
xlab("price for 2 people staying 4 nights")ggplot(extra_listings, aes(x = log(price_4_nights))) +
geom_density()+
theme_economist() +
xlab("log(price)")## NULL
Looking at our data, we believe that using the log(price_4_nights) would be a better variable to use for our regression model. This is so that a change in dependent variable could mean a multiplied % change in price_4_nights, rather than multiplied increments of coefficients. Otherwise, for data with price_4_nights several orders of magnitude larger, a model would need a disproportionate increase in independent variables to conform to a dependent variable.
# Create a variable of log price data
extra_listings_log <- extra_listings %>%
mutate(price_4_nights_log = log(price_4_nights))
#ggpairs(extra_listings_log$["price_4_nights_log","prop_type_simplified","number_of_reviews","review_scores_rating"])
#ggpairs(extra_listings_log[,c("price_4_nights_log","prop_type_simplified","number_of_reviews","review_scores_rating")])We need to see what are some possible numeric variables that could impact prices. Due to the display constraint of a correlation matrix in a R studio window. We divide the numeric variables into two groups. Also, we sieve out variables that we think are logically irrelevant to prices, such as id, minimum_nights, listings_count, availability, longitude, latitude.
# Create a dataframe for group 1 of numeric variables
matrix_data1 <- extra_listings_log %>%
select_if(is.numeric) %>%
select(!ends_with("id")) %>%
select(!contains("minimum_nights")) %>%
select(!contains("maximum_nights")) %>%
select(!contains("listings_count")) %>%
select(!starts_with("availability")) %>%
select(!contains("latitude")) %>%
select(!contains("longitude")) %>%
select(accommodates:guests_included, price_4_nights_log) %>%
select(-price)
# Visualize correlation matrix for group 1 of numeric variables
corr1 <- cor(matrix_data1)
ggcorrplot(corr1,
lab = TRUE)# Create a dataframe for group 2 of numeric variables
matrix_data2 <- extra_listings_log %>%
select_if(is.numeric) %>%
select(!ends_with("id")) %>%
select(!contains("minimum_nights")) %>%
select(!contains("maximum_nights")) %>%
select(!contains("listings_count")) %>%
select(!starts_with("availability")) %>%
select(!contains("latitude")) %>%
select(!contains("longitude")) %>%
select(extra_people:price_4_nights_log) %>%
select(-price_4_nights)
# Visualize correlation matrix for group 1 of numeric variables
corr2 <- cor(matrix_data2)
ggcorrplot(corr2,
lab = TRUE)Unfortunately, the correlation coefficients observed are relatively small. But this does not put us off because it confirms the need to incorporate other logical or character variables in predicting price_4_nights_log. Variables having a correlation coefficients greater or equal to 0.1 with price_4_nights_log include bedrooms, square_feet, guests_included, cleaning_fee, security_deposit, extra_people, review_scores_cleanliness. Although these will be numeric variables to closely watch later in models, except bedrooms, they are the variables not normally considered before booking an Airbnb. They can be theoretically useful, but may not apply in real life for predictions.
# Model 1, includes simplified property type, number of reviews, review ratings
model1 <- lm(price_4_nights_log ~
prop_type_simplified +
number_of_reviews +
review_scores_rating,
data = extra_listings_log)
# Check if there are clear differences between prices of different property types
favstats(~price_4_nights | prop_type_simplified, data = extra_listings_log)| prop_type_simplified | min | Q1 | median | Q3 | max | mean | sd | n | missing |
|---|---|---|---|---|---|---|---|---|---|
| Apartment | 56 | 272 | 422 | 756 | 4.01e+04 | 700 | 1.67e+03 | 1426 | 0 |
| Condominium | 60 | 356 | 628 | 915 | 4.02e+04 | 855 | 2.17e+03 | 793 | 0 |
| House | 131 | 250 | 400 | 586 | 3.56e+04 | 775 | 2.89e+03 | 304 | 0 |
| Other | 60 | 236 | 404 | 759 | 1.18e+04 | 630 | 805 | 934 | 0 |
| Serviced apartment | 348 | 632 | 947 | 1.28e+03 | 5.2e+04 | 2.41e+03 | 7.65e+03 | 197 | 0 |
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.746432 0.101091 56.84 < 2e-16
## prop_type_simplifiedCondominium 0.192579 0.037372 5.15 2.8e-07
## prop_type_simplifiedHouse -0.217104 0.051998 -4.18 3.1e-05
## prop_type_simplifiedOther -0.146498 0.034495 -4.25 2.2e-05
## prop_type_simplifiedServiced apartment 0.605880 0.067233 9.01 < 2e-16
## number_of_reviews -0.000999 0.000323 -3.10 0.002
## review_scores_rating 0.004810 0.001109 4.34 1.5e-05
##
## (Intercept) ***
## prop_type_simplifiedCondominium ***
## prop_type_simplifiedHouse ***
## prop_type_simplifiedOther ***
## prop_type_simplifiedServiced apartment ***
## number_of_reviews **
## review_scores_rating ***
##
## Residual standard error: 0.678 on 2426 degrees of freedom
## (1221 observations deleted due to missingness)
## Multiple R-squared: 0.0829, Adjusted R-squared: 0.0806
## F-statistic: 36.6 on 6 and 2426 DF, p-value: <2e-16
| price 4 nights log | |||
|---|---|---|---|
| Predictors | Estimates | CI | p |
| (Intercept) | 5.75 | 5.55 – 5.94 | <0.001 |
|
prop_type_simplified [Condominium] |
0.19 | 0.12 – 0.27 | <0.001 |
|
prop_type_simplified [House] |
-0.22 | -0.32 – -0.12 | <0.001 |
|
prop_type_simplified [Other] |
-0.15 | -0.21 – -0.08 | <0.001 |
|
prop_type_simplified [Serviced apartment] |
0.61 | 0.47 – 0.74 | <0.001 |
| number_of_reviews | -0.00 | -0.00 – -0.00 | 0.002 |
| review_scores_rating | 0.00 | 0.00 – 0.01 | <0.001 |
| Observations | 2433 | ||
| R2 / R2 adjusted | 0.083 / 0.081 | ||
vif(model1) %>% #calculate variance-inflation and generalized variance-inflation factors
kable() %>% #table type in HTML
kable_styling(bootstrap_options = "striped", full_width = F, position = "left") #styling of table| GVIF | Df | GVIF^(1/(2*Df)) | |
|---|---|---|---|
| prop_type_simplified | 1.02 | 4 | 1.00 |
| number_of_reviews | 1.02 | 1 | 1.01 |
| review_scores_rating | 1.02 | 1 | 1.01 |
Interpret the coefficient review_scores_rating in terms of price_4_nights: On average for 2 listings that have a difference of 1 in their review_scores_rating, the listing with the higher review will have a higher price_4_nights_log of 0.0048099. This means a higher price_4_nights of of 0.48% (or \(e^{0.0048099}-1\)). The t-value is 4.338 > 2, and p value = 1.49e-05 < 0.05, these mean the coefficient is statistically significant at a 95% confidence level.
Interpret the coefficient prop_type_simplified in terms of price_4_nights: The regression model uses Apartment as the base level for prop_type_simplified and the coefficients of the property types can be stated as: On average, a Condominium will have a higher price_4_nights_log of 0.1925787 as compared to an Apartment. This means a higher price of 21.23% (or \(e^{0.1925787}-1\)) than an Apartment. On average, a House will have a lower price of 19.51% (or \(e^{-0.2171037}-1\)) than an Apartment. On average, a property type of Other will have a lower price of -13.63% (or \(e^{-0.1464977}-1\)) than an Apartment. On average, a Serviced apartment will have a higher price of 83.29% (or \(e^{0.6058804}-1\)) than an Apartment. All of the absolute values of coefficients’ t-value are greater than 2, and p value < 0.05, meaning the coefficients are statistically significant at a 95% confidence level.
The adjusted r square for this model is 0.08064 or 8.1%. This means that the model with these 3 variables account for only 8.1% of the variation in the price_4_nights_log.
#Model 2, includes property type, number of reviews, reviews ratings and room type
model2 <- lm(price_4_nights_log ~
prop_type_simplified +
number_of_reviews +
review_scores_rating +
room_type,
data = extra_listings_log)
# Display table of coefficients
summary(model2)##
## Call:
## lm(formula = price_4_nights_log ~ prop_type_simplified + number_of_reviews +
## review_scores_rating + room_type, data = extra_listings_log)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.660 -0.351 -0.050 0.294 4.720
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.298363 0.083995 74.99 < 2e-16
## prop_type_simplifiedCondominium 0.044420 0.030806 1.44 0.15
## prop_type_simplifiedHouse -0.002812 0.043087 -0.07 0.95
## prop_type_simplifiedOther 0.284729 0.033598 8.47 < 2e-16
## prop_type_simplifiedServiced apartment 0.675706 0.058336 11.58 < 2e-16
## number_of_reviews -0.001409 0.000264 -5.34 1.0e-07
## review_scores_rating 0.003885 0.000908 4.28 1.9e-05
## room_typeHotel room -0.980829 0.047213 -20.77 < 2e-16
## room_typePrivate room -0.806641 0.026242 -30.74 < 2e-16
## room_typeShared room -1.425375 0.057221 -24.91 < 2e-16
##
## (Intercept) ***
## prop_type_simplifiedCondominium
## prop_type_simplifiedHouse
## prop_type_simplifiedOther ***
## prop_type_simplifiedServiced apartment ***
## number_of_reviews ***
## review_scores_rating ***
## room_typeHotel room ***
## room_typePrivate room ***
## room_typeShared room ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.553 on 2423 degrees of freedom
## (1221 observations deleted due to missingness)
## Multiple R-squared: 0.391, Adjusted R-squared: 0.388
## F-statistic: 173 on 9 and 2423 DF, p-value: <2e-16
# Count different room types
extra_listings_log %>%
count(room_type) %>%
arrange(desc(n)) %>%
kable()| room_type | n |
|---|---|
| Private room | 1754 |
| Entire home/apt | 1336 |
| Hotel room | 333 |
| Shared room | 231 |
### Model 2 Analysis
Model 2 includes the same variables as model 1 (prop_type_simplified, number_of_reviews, review_scores_rating) with a new added variable of room_type. As shown from the table above there are 4 main room types (Private room, Entire home/apt, Hotel room, Shared room). Model 2 uses the Entire home/apt as a base to assess the impact of the other room types on price.
From the model we can see that all 3 other room types have a negative relationship to an Entire home/apt. This makes sense as it would be more expensive to rent out an entire place as compared to just a single Private room or Shared room. A Shared room has the biggest negative relationship as compared to other room types which again makes sense as we expect a Shared room to be the cheapest as compared to the other more private options. The price of a Shared room is 75.96% (or \(e^{-1.4253747}-1\)) cheaper than the price of an Entire home/apt. The price of a Hotel room and Private room are 62.50% (or \(e^{-0.9808295}-1\)) and 55.36% (\(e^{-0.8066407}-1\) cheaper as compared to the price of an Entire home/apt. All of the absolute values of coefficients’ t-value are greater than 2, and p value < 0.05, meaning the coefficients are statistically significant at a 95% confidence level.
The adjusted r square for this model is 0.3883 or 38.8%. This means that model 2 accounts for 38.8% of the variation in the price_4_nights_log. This also shows that the room_type is a good variable to explain the variation as the r square increased by 30.7% from model 1 to model 2.
At this stage, when we think of if bedrooms, bathrooms, beds can impact price_4_nights_log, we realize that only when room_type is Entire home/apt, these 3 variables truly matter. So we first employ filter and count functions to see if the Entire home/apt and other room types differ significantly for these 3 variables.
entire_count_bedrooms <- extra_listings_log %>%
filter(room_type == "Entire home/apt") %>%
count(bedrooms) %>%
arrange(desc(n)) %>%
mutate(total = sum(n)) %>%
mutate(proportion = 100 * n/sum(n))
entire_count_bedrooms| bedrooms | n | total | proportion |
|---|---|---|---|
| 2 | 441 | 1336 | 33 |
| 1 | 427 | 1336 | 32 |
| 3 | 227 | 1336 | 17 |
| 0 | 163 | 1336 | 12.2 |
| 4 | 47 | 1336 | 3.52 |
| 5 | 15 | 1336 | 1.12 |
| 6 | 6 | 1336 | 0.449 |
| 10 | 5 | 1336 | 0.374 |
| 3 | 1336 | 0.225 | |
| 7 | 2 | 1336 | 0.15 |
entire_count_bathrooms <- extra_listings_log %>%
filter(room_type == "Entire home/apt") %>%
count(bathrooms) %>%
arrange(desc(n)) %>%
mutate(total = sum(n)) %>%
mutate(proportion = 100 * n/sum(n))
entire_count_bathrooms| bathrooms | n | total | proportion |
|---|---|---|---|
| 1 | 670 | 1336 | 50.1 |
| 2 | 391 | 1336 | 29.3 |
| 3 | 124 | 1336 | 9.28 |
| 1.5 | 68 | 1336 | 5.09 |
| 2.5 | 35 | 1336 | 2.62 |
| 3.5 | 15 | 1336 | 1.12 |
| 4 | 9 | 1336 | 0.674 |
| 6 | 7 | 1336 | 0.524 |
| 0 | 6 | 1336 | 0.449 |
| 5 | 5 | 1336 | 0.374 |
| 5.5 | 2 | 1336 | 0.15 |
| 4.5 | 1 | 1336 | 0.0749 |
| 7 | 1 | 1336 | 0.0749 |
| 8 | 1 | 1336 | 0.0749 |
| 1 | 1336 | 0.0749 |
entire_count_beds <- extra_listings_log %>%
filter(room_type == "Entire home/apt") %>%
count(beds) %>%
arrange(desc(n)) %>%
mutate(total = sum(n)) %>%
mutate(proportion = 100 * n/sum(n))
entire_count_beds| beds | n | total | proportion |
|---|---|---|---|
| 1 | 428 | 1336 | 32 |
| 2 | 369 | 1336 | 27.6 |
| 3 | 235 | 1336 | 17.6 |
| 4 | 143 | 1336 | 10.7 |
| 5 | 61 | 1336 | 4.57 |
| 0 | 29 | 1336 | 2.17 |
| 6 | 28 | 1336 | 2.1 |
| 8 | 1336 | 0.599 | |
| 8 | 7 | 1336 | 0.524 |
| 7 | 6 | 1336 | 0.449 |
| 10 | 6 | 1336 | 0.449 |
| 16 | 6 | 1336 | 0.449 |
| 9 | 3 | 1336 | 0.225 |
| 14 | 3 | 1336 | 0.225 |
| 11 | 2 | 1336 | 0.15 |
| 13 | 1 | 1336 | 0.0749 |
| 23 | 1 | 1336 | 0.0749 |
private_count_bedrooms <- extra_listings_log %>%
filter(room_type == "Private room") %>%
count(bedrooms) %>%
arrange(desc(n)) %>%
mutate(total = sum(n)) %>%
mutate(proportion = 100 * n/sum(n))
private_count_bedrooms| bedrooms | n | total | proportion |
|---|---|---|---|
| 1 | 1553 | 1754 | 88.5 |
| 0 | 99 | 1754 | 5.64 |
| 2 | 66 | 1754 | 3.76 |
| 3 | 20 | 1754 | 1.14 |
| 7 | 1754 | 0.399 | |
| 4 | 4 | 1754 | 0.228 |
| 6 | 3 | 1754 | 0.171 |
| 5 | 2 | 1754 | 0.114 |
private_count_bathrooms <- extra_listings_log %>%
filter(room_type == "Private room") %>%
count(bathrooms) %>%
arrange(desc(n)) %>%
mutate(total = sum(n)) %>%
mutate(proportion = 100 * n/sum(n))
private_count_bathrooms| bathrooms | n | total | proportion |
|---|---|---|---|
| 1 | 1309 | 1754 | 74.6 |
| 2 | 145 | 1754 | 8.27 |
| 1.5 | 115 | 1754 | 6.56 |
| 0 | 44 | 1754 | 2.51 |
| 0.5 | 34 | 1754 | 1.94 |
| 2.5 | 28 | 1754 | 1.6 |
| 3 | 25 | 1754 | 1.43 |
| 4 | 21 | 1754 | 1.2 |
| 6 | 9 | 1754 | 0.513 |
| 8 | 8 | 1754 | 0.456 |
| 5 | 6 | 1754 | 0.342 |
| 12 | 4 | 1754 | 0.228 |
| 7 | 3 | 1754 | 0.171 |
| 9 | 2 | 1754 | 0.114 |
| 1 | 1754 | 0.057 |
private_count_beds <- extra_listings_log %>%
filter(room_type == "Private room") %>%
count(beds) %>%
arrange(desc(n)) %>%
mutate(total = sum(n)) %>%
mutate(proportion = 100 * n/sum(n))
private_count_beds| beds | n | total | proportion |
|---|---|---|---|
| 1 | 1209 | 1754 | 68.9 |
| 2 | 340 | 1754 | 19.4 |
| 3 | 49 | 1754 | 2.79 |
| 0 | 46 | 1754 | 2.62 |
| 4 | 39 | 1754 | 2.22 |
| 26 | 1754 | 1.48 | |
| 6 | 14 | 1754 | 0.798 |
| 8 | 10 | 1754 | 0.57 |
| 7 | 7 | 1754 | 0.399 |
| 5 | 6 | 1754 | 0.342 |
| 10 | 4 | 1754 | 0.228 |
| 14 | 2 | 1754 | 0.114 |
| 16 | 2 | 1754 | 0.114 |
hotel_count_bedrooms <- extra_listings_log %>%
filter(room_type == "Hotel room") %>%
count(bedrooms) %>%
arrange(desc(n)) %>%
mutate(total = sum(n)) %>%
mutate(proportion = 100 * n/sum(n))
hotel_count_bedrooms| bedrooms | n | total | proportion |
|---|---|---|---|
| 1 | 263 | 333 | 79 |
| 0 | 36 | 333 | 10.8 |
| 2 | 22 | 333 | 6.61 |
| 3 | 9 | 333 | 2.7 |
| 4 | 1 | 333 | 0.3 |
| 7 | 1 | 333 | 0.3 |
| 8 | 1 | 333 | 0.3 |
hotel_count_bathrooms <- extra_listings_log %>%
filter(room_type == "Hotel room") %>%
count(bathrooms) %>%
arrange(desc(n)) %>%
mutate(total = sum(n)) %>%
mutate(proportion = 100 * n/sum(n))
hotel_count_bathrooms| bathrooms | n | total | proportion |
|---|---|---|---|
| 1 | 128 | 333 | 38.4 |
| 3 | 55 | 333 | 16.5 |
| 4 | 43 | 333 | 12.9 |
| 2 | 36 | 333 | 10.8 |
| 6 | 17 | 333 | 5.11 |
| 0 | 11 | 333 | 3.3 |
| 5 | 8 | 333 | 2.4 |
| 1.5 | 7 | 333 | 2.1 |
| 8.5 | 7 | 333 | 2.1 |
| 10 | 5 | 333 | 1.5 |
| 16 | 4 | 333 | 1.2 |
| 8 | 3 | 333 | 0.901 |
| 13 | 3 | 333 | 0.901 |
| 0.5 | 2 | 333 | 0.601 |
| 2.5 | 1 | 333 | 0.3 |
| 7 | 1 | 333 | 0.3 |
| 9 | 1 | 333 | 0.3 |
| 12.5 | 1 | 333 | 0.3 |
hotel_count_beds <- extra_listings_log %>%
filter(room_type == "Hotel room") %>%
count(beds) %>%
arrange(desc(n)) %>%
mutate(total = sum(n)) %>%
mutate(proportion = 100 * n/sum(n))
hotel_count_beds| beds | n | total | proportion |
|---|---|---|---|
| 1 | 157 | 333 | 47.1 |
| 2 | 52 | 333 | 15.6 |
| 10 | 20 | 333 | 6.01 |
| 8 | 13 | 333 | 3.9 |
| 3 | 12 | 333 | 3.6 |
| 6 | 11 | 333 | 3.3 |
| 16 | 11 | 333 | 3.3 |
| 0 | 10 | 333 | 3 |
| 4 | 9 | 333 | 2.7 |
| 14 | 9 | 333 | 2.7 |
| 30 | 9 | 333 | 2.7 |
| 12 | 6 | 333 | 1.8 |
| 18 | 6 | 333 | 1.8 |
| 13 | 2 | 333 | 0.601 |
| 5 | 1 | 333 | 0.3 |
| 7 | 1 | 333 | 0.3 |
| 9 | 1 | 333 | 0.3 |
| 36 | 1 | 333 | 0.3 |
| 58 | 1 | 333 | 0.3 |
| 1 | 333 | 0.3 |
shared_count_bedrooms <- extra_listings_log %>%
filter(room_type == "Shared room") %>%
count(bedrooms) %>%
arrange(desc(n)) %>%
mutate(total = sum(n)) %>%
mutate(proportion = 100 * n/sum(n))
shared_count_bedrooms| bedrooms | n | total | proportion |
|---|---|---|---|
| 1 | 231 | 231 | 100 |
shared_count_bathrooms <- extra_listings_log %>%
filter(room_type == "Shared room") %>%
count(bathrooms) %>%
arrange(desc(n)) %>%
mutate(total = sum(n)) %>%
mutate(proportion = 100 * n/sum(n))
shared_count_bathrooms| bathrooms | n | total | proportion |
|---|---|---|---|
| 1 | 77 | 231 | 33.3 |
| 0 | 33 | 231 | 14.3 |
| 3 | 31 | 231 | 13.4 |
| 2 | 23 | 231 | 9.96 |
| 0.5 | 11 | 231 | 4.76 |
| 4 | 11 | 231 | 4.76 |
| 5 | 8 | 231 | 3.46 |
| 1.5 | 7 | 231 | 3.03 |
| 2.5 | 7 | 231 | 3.03 |
| 7 | 7 | 231 | 3.03 |
| 6 | 6 | 231 | 2.6 |
| 8 | 4 | 231 | 1.73 |
| 9 | 2 | 231 | 0.866 |
| 21 | 2 | 231 | 0.866 |
| 7.5 | 1 | 231 | 0.433 |
| 10 | 1 | 231 | 0.433 |
shared_count_beds <- extra_listings_log %>%
filter(room_type == "Shared room") %>%
count(beds) %>%
arrange(desc(n)) %>%
mutate(total = sum(n)) %>%
mutate(proportion = 100 * n/sum(n))
shared_count_beds| beds | n | total | proportion |
|---|---|---|---|
| 1 | 117 | 231 | 50.6 |
| 2 | 25 | 231 | 10.8 |
| 4 | 16 | 231 | 6.93 |
| 6 | 15 | 231 | 6.49 |
| 16 | 10 | 231 | 4.33 |
| 8 | 9 | 231 | 3.9 |
| 10 | 7 | 231 | 3.03 |
| 32 | 5 | 231 | 2.16 |
| 5 | 231 | 2.16 | |
| 12 | 4 | 231 | 1.73 |
| 14 | 4 | 231 | 1.73 |
| 3 | 3 | 231 | 1.3 |
| 0 | 2 | 231 | 0.866 |
| 5 | 2 | 231 | 0.866 |
| 9 | 2 | 231 | 0.866 |
| 20 | 2 | 231 | 0.866 |
| 7 | 1 | 231 | 0.433 |
| 28 | 1 | 231 | 0.433 |
| 46 | 1 | 231 | 0.433 |
room_type_count <- extra_listings_log %>%
count(room_type) %>%
arrange(desc(n)) %>%
mutate(total = sum(n)) %>%
mutate(proportion = 100 * n/sum(n))
room_type_count| room_type | n | total | proportion |
|---|---|---|---|
| Private room | 1754 | 3654 | 48 |
| Entire home/apt | 1336 | 3654 | 36.6 |
| Hotel room | 333 | 3654 | 9.11 |
| Shared room | 231 | 3654 | 6.32 |
The above count and filter operations show that for Entire home/apt, bedrooms, beds, bathrooms counts differ significantly within itself. (And Entire home/apt accounts for 36.56% of data set, significant!) Within other types, the 3 variables will have much more significant counts for “1” over other numbers. Therefore, it is worth considering to downplay the importance ofbedrooms, beds, bathrooms, in predicting the price_4_nights_log for Private room, Hotel room, Shared room. For now, we will still handle the 4 room_types under the same model.
Do we want to build a separate model for `Entire home/apt??? probably at the end in conclusion? to show our critical thinking
# Model 3, includes property type, number of reviews, reviews ratings,room type, bedrooms, bathrooms and beds
model3 <-lm(price_4_nights_log ~
prop_type_simplified +
number_of_reviews +
review_scores_rating +
room_type +
bathrooms +
bedrooms +
beds,
data = extra_listings_log)
# Display coefficients
summary(model3)##
## Call:
## lm(formula = price_4_nights_log ~ prop_type_simplified + number_of_reviews +
## review_scores_rating + room_type + bathrooms + bedrooms +
## beds, data = extra_listings_log)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.048 -0.296 -0.042 0.271 4.713
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.704932 0.084485 67.53 < 2e-16
## prop_type_simplifiedCondominium 0.026866 0.028600 0.94 0.348
## prop_type_simplifiedHouse -0.019362 0.039933 -0.48 0.628
## prop_type_simplifiedOther 0.317102 0.031962 9.92 < 2e-16
## prop_type_simplifiedServiced apartment 0.660393 0.054394 12.14 < 2e-16
## number_of_reviews -0.001273 0.000245 -5.20 2.1e-07
## review_scores_rating 0.005828 0.000854 6.82 1.1e-11
## room_typeHotel room -0.780591 0.046247 -16.88 < 2e-16
## room_typePrivate room -0.647655 0.025728 -25.17 < 2e-16
## room_typeShared room -1.233408 0.054809 -22.50 < 2e-16
## bathrooms -0.018002 0.007902 -2.28 0.023
## bedrooms 0.272009 0.014032 19.38 < 2e-16
## beds -0.003766 0.003554 -1.06 0.290
##
## (Intercept) ***
## prop_type_simplifiedCondominium
## prop_type_simplifiedHouse
## prop_type_simplifiedOther ***
## prop_type_simplifiedServiced apartment ***
## number_of_reviews ***
## review_scores_rating ***
## room_typeHotel room ***
## room_typePrivate room ***
## room_typeShared room ***
## bathrooms *
## bedrooms ***
## beds
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.511 on 2404 degrees of freedom
## (1237 observations deleted due to missingness)
## Multiple R-squared: 0.48, Adjusted R-squared: 0.477
## F-statistic: 185 on 12 and 2404 DF, p-value: <2e-16
## GVIF Df GVIF^(1/(2*Df))
## prop_type_simplified 1.96 4 1.09
## number_of_reviews 1.03 1 1.01
## review_scores_rating 1.05 1 1.02
## room_type 2.30 3 1.15
## bathrooms 1.39 1 1.18
## bedrooms 1.40 1 1.18
## beds 1.42 1 1.19
model3_1 <-lm(price_4_nights_log ~
prop_type_simplified +
number_of_reviews +
review_scores_rating +
room_type +
bathrooms +
bedrooms,
data = extra_listings_log)
# Display coefficients
summary(model3_1)##
## Call:
## lm(formula = price_4_nights_log ~ prop_type_simplified + number_of_reviews +
## review_scores_rating + room_type + bathrooms + bedrooms,
## data = extra_listings_log)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.045 -0.298 -0.042 0.271 4.710
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.702491 0.083941 67.93 < 2e-16
## prop_type_simplifiedCondominium 0.029510 0.028584 1.03 0.3020
## prop_type_simplifiedHouse -0.020137 0.039906 -0.50 0.6139
## prop_type_simplifiedOther 0.322511 0.031664 10.19 < 2e-16
## prop_type_simplifiedServiced apartment 0.669639 0.054221 12.35 < 2e-16
## number_of_reviews -0.001302 0.000244 -5.33 1.1e-07
## review_scores_rating 0.005844 0.000847 6.90 6.8e-12
## room_typeHotel room -0.793785 0.045507 -17.44 < 2e-16
## room_typePrivate room -0.644492 0.025695 -25.08 < 2e-16
## room_typeShared room -1.239739 0.054107 -22.91 < 2e-16
## bathrooms -0.020562 0.007727 -2.66 0.0078
## bedrooms 0.268517 0.013545 19.82 < 2e-16
##
## (Intercept) ***
## prop_type_simplifiedCondominium
## prop_type_simplifiedHouse
## prop_type_simplifiedOther ***
## prop_type_simplifiedServiced apartment ***
## number_of_reviews ***
## review_scores_rating ***
## room_typeHotel room ***
## room_typePrivate room ***
## room_typeShared room ***
## bathrooms **
## bedrooms ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.512 on 2417 degrees of freedom
## (1225 observations deleted due to missingness)
## Multiple R-squared: 0.478, Adjusted R-squared: 0.476
## F-statistic: 202 on 11 and 2417 DF, p-value: <2e-16
## GVIF Df GVIF^(1/(2*Df))
## prop_type_simplified 1.90 4 1.08
## number_of_reviews 1.02 1 1.01
## review_scores_rating 1.04 1 1.02
## room_type 2.16 3 1.14
## bathrooms 1.33 1 1.15
## bedrooms 1.30 1 1.14
Model 3 includes the same variables as model 2 (property_type, number of reviews, review_scores_rating, room_type) with 3 new added variables of bathrooms, bedrooms and beds. Adjusted R square improves largely from 38.3% to 47.7%. But it looks like beds has a low |t-value| < 2 and large p-value > 0.05. For later models, beds should be excluded. And after exclusion we get adjusted R square = 47.6%, about the same as before.
Model 4 consists of the same variables of model 3 with the added variablehost_is_superhost specifying if the host is a super host.
# Model 4, includes property type, number of reviews, reviews ratings,room type, bedrooms, bathrooms, and is the host a super host
model4<-lm(price_4_nights_log ~
prop_type_simplified +
number_of_reviews +
review_scores_rating +
room_type +
bathrooms +
bedrooms +
host_is_superhost,
data = extra_listings_log)
summary(model4)##
## Call:
## lm(formula = price_4_nights_log ~ prop_type_simplified + number_of_reviews +
## review_scores_rating + room_type + bathrooms + bedrooms +
## host_is_superhost, data = extra_listings_log)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.144 -0.308 -0.035 0.273 4.730
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.749358 0.085402 67.32 < 2e-16
## prop_type_simplifiedCondominium 0.026704 0.028684 0.93 0.35197
## prop_type_simplifiedHouse -0.030721 0.040007 -0.77 0.44262
## prop_type_simplifiedOther 0.312977 0.031784 9.85 < 2e-16
## prop_type_simplifiedServiced apartment 0.655642 0.054375 12.06 < 2e-16
## number_of_reviews -0.001354 0.000245 -5.53 3.5e-08
## review_scores_rating 0.005094 0.000872 5.84 5.8e-09
## room_typeHotel room -0.787715 0.045633 -17.26 < 2e-16
## room_typePrivate room -0.641314 0.025932 -24.73 < 2e-16
## room_typeShared room -1.241512 0.054209 -22.90 < 2e-16
## bathrooms -0.018057 0.007760 -2.33 0.02005
## bedrooms 0.270599 0.013695 19.76 < 2e-16
## host_is_superhostTRUE 0.118067 0.030562 3.86 0.00011
##
## (Intercept) ***
## prop_type_simplifiedCondominium
## prop_type_simplifiedHouse
## prop_type_simplifiedOther ***
## prop_type_simplifiedServiced apartment ***
## number_of_reviews ***
## review_scores_rating ***
## room_typeHotel room ***
## room_typePrivate room ***
## room_typeShared room ***
## bathrooms *
## bedrooms ***
## host_is_superhostTRUE ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.512 on 2402 degrees of freedom
## (1239 observations deleted due to missingness)
## Multiple R-squared: 0.481, Adjusted R-squared: 0.479
## F-statistic: 186 on 12 and 2402 DF, p-value: <2e-16
## GVIF Df GVIF^(1/(2*Df))
## prop_type_simplified 1.92 4 1.08
## number_of_reviews 1.03 1 1.01
## review_scores_rating 1.10 1 1.05
## room_type 2.18 3 1.14
## bathrooms 1.34 1 1.16
## bedrooms 1.32 1 1.15
## host_is_superhost 1.09 1 1.04
### Model 4 Analysis
Based on the regression, the fact of if a host is a super host has some effect on the price of the property. As shown from the table above, if the host is a super host (i.e. host_is_superhost = TRUE), the price would increase by 12.53% \(e^{0.1180666}-1\). The t-value and p-value are statistically significant at 95% level.
Model 4 has an adjusted R square of 47.86%, which means that the variables in model 4 explains only 47.86% of the variation in price_4_nights_log. Model 3 on the other hand has an adjusted R square of 47.60%. The different in adjusted r squares for the 2 models is only 0.26%. This tells us that the variable host_is_superhost might not be a good indicator for the variation in price of an Airbnb.
Model 5 consists of the same variables of model 4 with the added variable is_location_exact on if the exact location of the Air BNB is given.
model5<-lm(price_4_nights_log ~
prop_type_simplified +
number_of_reviews +
review_scores_rating +
room_type +
bathrooms +
bedrooms +
host_is_superhost +
is_location_exact,
data = extra_listings_log)
summary(model5)##
## Call:
## lm(formula = price_4_nights_log ~ prop_type_simplified + number_of_reviews +
## review_scores_rating + room_type + bathrooms + bedrooms +
## host_is_superhost + is_location_exact, data = extra_listings_log)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.065 -0.305 -0.028 0.270 4.701
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.663481 0.088165 64.24 < 2e-16
## prop_type_simplifiedCondominium 0.024968 0.028610 0.87 0.38290
## prop_type_simplifiedHouse -0.041084 0.039992 -1.03 0.30439
## prop_type_simplifiedOther 0.306420 0.031745 9.65 < 2e-16
## prop_type_simplifiedServiced apartment 0.651628 0.054237 12.01 < 2e-16
## number_of_reviews -0.001413 0.000245 -5.78 8.6e-09
## review_scores_rating 0.005182 0.000870 5.96 2.9e-09
## room_typeHotel room -0.781930 0.045534 -17.17 < 2e-16
## room_typePrivate room -0.632975 0.025956 -24.39 < 2e-16
## room_typeShared room -1.232961 0.054108 -22.79 < 2e-16
## bathrooms -0.017484 0.007740 -2.26 0.02398
## bedrooms 0.270130 0.013658 19.78 < 2e-16
## host_is_superhostTRUE 0.116816 0.030480 3.83 0.00013
## is_location_exactTRUE 0.097542 0.025884 3.77 0.00017
##
## (Intercept) ***
## prop_type_simplifiedCondominium
## prop_type_simplifiedHouse
## prop_type_simplifiedOther ***
## prop_type_simplifiedServiced apartment ***
## number_of_reviews ***
## review_scores_rating ***
## room_typeHotel room ***
## room_typePrivate room ***
## room_typeShared room ***
## bathrooms *
## bedrooms ***
## host_is_superhostTRUE ***
## is_location_exactTRUE ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.51 on 2401 degrees of freedom
## (1239 observations deleted due to missingness)
## Multiple R-squared: 0.484, Adjusted R-squared: 0.481
## F-statistic: 173 on 13 and 2401 DF, p-value: <2e-16
## GVIF Df GVIF^(1/(2*Df))
## prop_type_simplified 1.93 4 1.09
## number_of_reviews 1.03 1 1.01
## review_scores_rating 1.10 1 1.05
## room_type 2.20 3 1.14
## bathrooms 1.34 1 1.16
## bedrooms 1.32 1 1.15
## host_is_superhost 1.09 1 1.04
## is_location_exact 1.02 1 1.01
### Model 5 Analysis
As with the super host variable in model 4 the fact if the exact location of the Air BNB is given seems insignificant. As shown from the table above, if the exact location is given, the price would increase by 10.25% (\(e^{0.0975425}-1\). The t-value and p-value are statistically significant at 95% level.
Model 5 has an adjusted R square of 48.14%, which means that the variables in model 5 explains only 48.14% of the variation in log price for 4 nights. Model 4 on the other hand has an adjusted R square of 47.86%. The difference in adjusted r squares for the 2 models is only 0.28%. This tells us that the variable on if the exact location of the Airbnb is given might not be a good indicator for the variation in price of an Airbnb.
Based on the data set we have, we observed that the variable neighborhood_group_cleansed groups the neighborhoods in Singapore into 5 main areas: Central Region, East Region, West Region, North-East Region, North Region. While we could use these regions to account for the different neighborhoods in Singapore, we decided to regroup the neighborhoods in Singapore based on price per square feet. Our new variable, neighborhood_simplified, groups the neighborhoods in Singapore into 4 main regions, A,B,C and D. The neighborhoods in A consists of the areas with the highest price per square feet, which is mostly the areas around Central Singapore, while the price per square feet decreases in each group as they move away from Central Singapore
#count(distinct(extra_listings_log$neighbourhood_group_cleansed))
#skim(extra_listings_log)
# Create neighborhood_simplified
extra_listings_log <- extra_listings_log %>%
mutate(neighbourhood_simplified = case_when(
neighbourhood_cleansed %in% c("Rochor", "Downtown Core", "River Valley","Singapore River","Orchard","Tanglin","Southern Islands","Museum","Marina South") ~ "A",
neighbourhood_cleansed %in% c("Kallang", "Geylang","Outram", "Novena", "Bukit Merah","Queenstown","Bukit Timah","Toa Payoh","Marine Parade","Clementi","Newton","Bishan","Ang Mo Kio","Central Water Catchment") ~ "B",
neighbourhood_cleansed %in% c("Bedok","Jurong West","Woodlands","Hougang","Jurong East","Chua Chu Kang","Serangoon","Tampines","Bukit Batok","Sengkang","Pasir Ris","Yishun","Punggol","Bukit Panjang","Sembawang","Pioneer","Sungei Kadut","Tuas") ~ "C",
TRUE ~ "D"))After creating our own neighbourhood_simplified, we can map them and see how they are distributed
# Count each unique levels under neighbourhood_simplified
n_simplfied_count <- extra_listings_log %>%
count(neighbourhood_simplified) %>%
arrange(desc(n)) %>%
mutate(total = sum(n)) %>%
mutate(proportion = 100 * n/sum(n))
n_simplfied_count| neighbourhood_simplified | n | total | proportion |
|---|---|---|---|
| B | 2061 | 3654 | 56.4 |
| A | 864 | 3654 | 23.6 |
| C | 690 | 3654 | 18.9 |
| D | 39 | 3654 | 1.07 |
# Create a palette that maps prop_type_simplified factor levels to colors
pal3 <- colorFactor(c("navy", "red", "yellow", "black"),
domain = c("B", "A", "C", "D"))
# Apply the palette to actual data
leaflet(data = extra_listings_log) %>%
addTiles() %>%
addCircleMarkers(
lng = ~longitude,
lat = ~latitude,
color = ~pal3(neighbourhood_simplified),
radius = 3,
stroke = FALSE,
fillOpacity = 0.5)Now, let’s build a regression model 6, that’s model 5 including an additional variable neighbourhood_simplified.
| neighbourhood_simplified | n |
|---|---|
| B | 2061 |
| A | 864 |
| C | 690 |
| D | 39 |
| neighbourhood_group_cleansed | n |
|---|---|
| Central Region | 2837 |
| East Region | 281 |
| West Region | 280 |
| North-East Region | 161 |
| North Region | 95 |
model6<-lm(price_4_nights_log ~
prop_type_simplified +
number_of_reviews +
review_scores_rating +
room_type +
bathrooms +
bedrooms +
host_is_superhost +
is_location_exact +
neighbourhood_simplified,
data = extra_listings_log)
summary(model6)##
## Call:
## lm(formula = price_4_nights_log ~ prop_type_simplified + number_of_reviews +
## review_scores_rating + room_type + bathrooms + bedrooms +
## host_is_superhost + is_location_exact + neighbourhood_simplified,
## data = extra_listings_log)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.022 -0.288 -0.021 0.252 4.749
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.839150 0.089345 65.36 < 2e-16
## prop_type_simplifiedCondominium 0.026885 0.028198 0.95 0.3405
## prop_type_simplifiedHouse -0.020663 0.039843 -0.52 0.6041
## prop_type_simplifiedOther 0.272795 0.031513 8.66 < 2e-16
## prop_type_simplifiedServiced apartment 0.644998 0.053537 12.05 < 2e-16
## number_of_reviews -0.001418 0.000241 -5.88 4.6e-09
## review_scores_rating 0.005062 0.000858 5.90 4.1e-09
## room_typeHotel room -0.774654 0.044881 -17.26 < 2e-16
## room_typePrivate room -0.615885 0.026338 -23.38 < 2e-16
## room_typeShared room -1.241076 0.053474 -23.21 < 2e-16
## bathrooms -0.019421 0.007652 -2.54 0.0112
## bedrooms 0.273705 0.013480 20.31 < 2e-16
## host_is_superhostTRUE 0.117738 0.030136 3.91 9.6e-05
## is_location_exactTRUE 0.083461 0.025784 3.24 0.0012
## neighbourhood_simplifiedB -0.201694 0.025015 -8.06 1.2e-15
## neighbourhood_simplifiedC -0.216492 0.034488 -6.28 4.1e-10
## neighbourhood_simplifiedD -0.376206 0.094974 -3.96 7.7e-05
##
## (Intercept) ***
## prop_type_simplifiedCondominium
## prop_type_simplifiedHouse
## prop_type_simplifiedOther ***
## prop_type_simplifiedServiced apartment ***
## number_of_reviews ***
## review_scores_rating ***
## room_typeHotel room ***
## room_typePrivate room ***
## room_typeShared room ***
## bathrooms *
## bedrooms ***
## host_is_superhostTRUE ***
## is_location_exactTRUE **
## neighbourhood_simplifiedB ***
## neighbourhood_simplifiedC ***
## neighbourhood_simplifiedD ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.503 on 2398 degrees of freedom
## (1239 observations deleted due to missingness)
## Multiple R-squared: 0.5, Adjusted R-squared: 0.497
## F-statistic: 150 on 16 and 2398 DF, p-value: <2e-16
## GVIF Df GVIF^(1/(2*Df))
## prop_type_simplified 2.03 4 1.09
## number_of_reviews 1.03 1 1.02
## review_scores_rating 1.10 1 1.05
## room_type 2.36 3 1.15
## bathrooms 1.35 1 1.16
## bedrooms 1.32 1 1.15
## host_is_superhost 1.10 1 1.05
## is_location_exact 1.04 1 1.02
## neighbourhood_simplified 1.24 3 1.04
Model 6 adds on to model 5 with the added variable of neighborhood_simplified which accounts for the different neighborhoods in Singapore.
The model uses neighborhood group A as the baseline and compares the other neighborhoods against A. As we expected, neighborhood A has the highest average price with the other neighborhoods being on average lower than group A. Neighborhood B for example of average has a price that is 18.27% (\(e^{-0.2016941}-1\) lower in price as compared to neighborhood A. Neighborhood C has an average price which is 19.47% (\(e^{-0.2164924}-1\) lower than neighborhood A. Again, as we expected, neighborhood C is lower than both neighborhood A and B.
Model 6 has an adjusted R square of 49.68%, which means that the variables in model 5 explains 49.68% of the variation in price_4_nights_log. Model 5 on the other hand has an adjusted R square of 48.14%. The different in adjusted r squares for the 2 models is 1.54%. This shows that the neighborhood_simplified adds marginal value to the model.
model7<-lm(price_4_nights_log ~
prop_type_simplified +
number_of_reviews +
review_scores_rating +
room_type +
bathrooms +
bedrooms +
host_is_superhost +
is_location_exact +
neighbourhood_simplified +
cancellation_policy,
data = extra_listings_log)
summary(model7)##
## Call:
## lm(formula = price_4_nights_log ~ prop_type_simplified + number_of_reviews +
## review_scores_rating + room_type + bathrooms + bedrooms +
## host_is_superhost + is_location_exact + neighbourhood_simplified +
## cancellation_policy, data = extra_listings_log)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.963 -0.291 -0.030 0.254 4.695
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 5.750028 0.093225 61.68
## prop_type_simplifiedCondominium 0.026662 0.028090 0.95
## prop_type_simplifiedHouse -0.020389 0.039718 -0.51
## prop_type_simplifiedOther 0.270925 0.031530 8.59
## prop_type_simplifiedServiced apartment 0.614170 0.053855 11.40
## number_of_reviews -0.001488 0.000242 -6.16
## review_scores_rating 0.005363 0.000858 6.25
## room_typeHotel room -0.752566 0.045091 -16.69
## room_typePrivate room -0.602120 0.026550 -22.68
## room_typeShared room -1.207991 0.054070 -22.34
## bathrooms -0.019900 0.007638 -2.61
## bedrooms 0.273979 0.013448 20.37
## host_is_superhostTRUE 0.108227 0.030164 3.59
## is_location_exactTRUE 0.079084 0.025708 3.08
## neighbourhood_simplifiedB -0.201191 0.025011 -8.04
## neighbourhood_simplifiedC -0.206334 0.034434 -5.99
## neighbourhood_simplifiedD -0.371903 0.094604 -3.93
## cancellation_policymoderate -0.001368 0.034392 -0.04
## cancellation_policystrict_14_with_grace_period 0.093647 0.029292 3.20
## cancellation_policysuper_strict_30 0.826958 0.505235 1.64
## cancellation_policysuper_strict_60 0.814680 0.504675 1.61
## Pr(>|t|)
## (Intercept) < 2e-16 ***
## prop_type_simplifiedCondominium 0.34264
## prop_type_simplifiedHouse 0.60776
## prop_type_simplifiedOther < 2e-16 ***
## prop_type_simplifiedServiced apartment < 2e-16 ***
## number_of_reviews 8.7e-10 ***
## review_scores_rating 4.7e-10 ***
## room_typeHotel room < 2e-16 ***
## room_typePrivate room < 2e-16 ***
## room_typeShared room < 2e-16 ***
## bathrooms 0.00923 **
## bedrooms < 2e-16 ***
## host_is_superhostTRUE 0.00034 ***
## is_location_exactTRUE 0.00212 **
## neighbourhood_simplifiedB 1.4e-15 ***
## neighbourhood_simplifiedC 2.4e-09 ***
## neighbourhood_simplifiedD 8.7e-05 ***
## cancellation_policymoderate 0.96827
## cancellation_policystrict_14_with_grace_period 0.00141 **
## cancellation_policysuper_strict_30 0.10181
## cancellation_policysuper_strict_60 0.10660
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.501 on 2394 degrees of freedom
## (1239 observations deleted due to missingness)
## Multiple R-squared: 0.505, Adjusted R-squared: 0.501
## F-statistic: 122 on 20 and 2394 DF, p-value: <2e-16
## GVIF Df GVIF^(1/(2*Df))
## prop_type_simplified 2.10 4 1.10
## number_of_reviews 1.05 1 1.02
## review_scores_rating 1.11 1 1.05
## room_type 2.49 3 1.16
## bathrooms 1.36 1 1.17
## bedrooms 1.33 1 1.15
## host_is_superhost 1.11 1 1.05
## is_location_exact 1.04 1 1.02
## neighbourhood_simplified 1.26 3 1.04
## cancellation_policy 1.18 4 1.02
### Model 7 Analysis
Model 7 has achieved the best adjusted R square so far at 50.08%.
# Add more variables to make the model a better predictor
model8<-lm(price_4_nights_log ~
prop_type_simplified +
number_of_reviews +
review_scores_rating +
room_type +
bathrooms +
bedrooms +
host_is_superhost +
is_location_exact +
cancellation_policy,
data = extra_listings_log)
summary(model8)##
## Call:
## lm(formula = price_4_nights_log ~ prop_type_simplified + number_of_reviews +
## review_scores_rating + room_type + bathrooms + bedrooms +
## host_is_superhost + is_location_exact + cancellation_policy,
## data = extra_listings_log)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.009 -0.301 -0.034 0.266 4.656
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 5.585853 0.092528 60.37
## prop_type_simplifiedCondominium 0.025250 0.028491 0.89
## prop_type_simplifiedHouse -0.037991 0.039860 -0.95
## prop_type_simplifiedOther 0.305446 0.031734 9.63
## prop_type_simplifiedServiced apartment 0.616853 0.054556 11.31
## number_of_reviews -0.001474 0.000245 -6.01
## review_scores_rating 0.005488 0.000870 6.31
## room_typeHotel room -0.758748 0.045740 -16.59
## room_typePrivate room -0.618329 0.026237 -23.57
## room_typeShared room -1.201610 0.054753 -21.95
## bathrooms -0.017933 0.007724 -2.32
## bedrooms 0.270420 0.013621 19.85
## host_is_superhostTRUE 0.108287 0.030487 3.55
## is_location_exactTRUE 0.091348 0.025812 3.54
## cancellation_policymoderate -0.018761 0.034822 -0.54
## cancellation_policystrict_14_with_grace_period 0.083848 0.029619 2.83
## cancellation_policysuper_strict_30 0.968193 0.512438 1.89
## cancellation_policysuper_strict_60 0.953047 0.511848 1.86
## Pr(>|t|)
## (Intercept) < 2e-16 ***
## prop_type_simplifiedCondominium 0.37557
## prop_type_simplifiedHouse 0.34063
## prop_type_simplifiedOther < 2e-16 ***
## prop_type_simplifiedServiced apartment < 2e-16 ***
## number_of_reviews 2.2e-09 ***
## review_scores_rating 3.3e-10 ***
## room_typeHotel room < 2e-16 ***
## room_typePrivate room < 2e-16 ***
## room_typeShared room < 2e-16 ***
## bathrooms 0.02033 *
## bedrooms < 2e-16 ***
## host_is_superhostTRUE 0.00039 ***
## is_location_exactTRUE 0.00041 ***
## cancellation_policymoderate 0.59010
## cancellation_policystrict_14_with_grace_period 0.00468 **
## cancellation_policysuper_strict_30 0.05896 .
## cancellation_policysuper_strict_60 0.06273 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.508 on 2397 degrees of freedom
## (1239 observations deleted due to missingness)
## Multiple R-squared: 0.489, Adjusted R-squared: 0.486
## F-statistic: 135 on 17 and 2397 DF, p-value: <2e-16
huxreg("Model 1" = model1, "Model 2" = model2, "Model 3" = model3, "Model 4" = model4, "Model 5" = model5, "Model 6" = model6, "Model 7" = model7, statistics = c('#observations' = 'nobs',
'R squared' = 'r.squared',
'Adj. R Squared' = 'adj.r.squared',
'Residual SE' = 'sigma'),
bold_signif = 0.05,
stars = NULL)| Model 1 | Model 2 | Model 3 | Model 4 | Model 5 | Model 6 | Model 7 | |
|---|---|---|---|---|---|---|---|
| (Intercept) | 5.746 | 6.298 | 5.705 | 5.749 | 5.663 | 5.839 | 5.750 |
| (0.101) | (0.084) | (0.084) | (0.085) | (0.088) | (0.089) | (0.093) | |
| prop_type_simplifiedCondominium | 0.193 | 0.044 | 0.027 | 0.027 | 0.025 | 0.027 | 0.027 |
| (0.037) | (0.031) | (0.029) | (0.029) | (0.029) | (0.028) | (0.028) | |
| prop_type_simplifiedHouse | -0.217 | -0.003 | -0.019 | -0.031 | -0.041 | -0.021 | -0.020 |
| (0.052) | (0.043) | (0.040) | (0.040) | (0.040) | (0.040) | (0.040) | |
| prop_type_simplifiedOther | -0.146 | 0.285 | 0.317 | 0.313 | 0.306 | 0.273 | 0.271 |
| (0.034) | (0.034) | (0.032) | (0.032) | (0.032) | (0.032) | (0.032) | |
| prop_type_simplifiedServiced apartment | 0.606 | 0.676 | 0.660 | 0.656 | 0.652 | 0.645 | 0.614 |
| (0.067) | (0.058) | (0.054) | (0.054) | (0.054) | (0.054) | (0.054) | |
| number_of_reviews | -0.001 | -0.001 | -0.001 | -0.001 | -0.001 | -0.001 | -0.001 |
| (0.000) | (0.000) | (0.000) | (0.000) | (0.000) | (0.000) | (0.000) | |
| review_scores_rating | 0.005 | 0.004 | 0.006 | 0.005 | 0.005 | 0.005 | 0.005 |
| (0.001) | (0.001) | (0.001) | (0.001) | (0.001) | (0.001) | (0.001) | |
| room_typeHotel room | -0.981 | -0.781 | -0.788 | -0.782 | -0.775 | -0.753 | |
| (0.047) | (0.046) | (0.046) | (0.046) | (0.045) | (0.045) | ||
| room_typePrivate room | -0.807 | -0.648 | -0.641 | -0.633 | -0.616 | -0.602 | |
| (0.026) | (0.026) | (0.026) | (0.026) | (0.026) | (0.027) | ||
| room_typeShared room | -1.425 | -1.233 | -1.242 | -1.233 | -1.241 | -1.208 | |
| (0.057) | (0.055) | (0.054) | (0.054) | (0.053) | (0.054) | ||
| bathrooms | -0.018 | -0.018 | -0.017 | -0.019 | -0.020 | ||
| (0.008) | (0.008) | (0.008) | (0.008) | (0.008) | |||
| bedrooms | 0.272 | 0.271 | 0.270 | 0.274 | 0.274 | ||
| (0.014) | (0.014) | (0.014) | (0.013) | (0.013) | |||
| beds | -0.004 | ||||||
| (0.004) | |||||||
| host_is_superhostTRUE | 0.118 | 0.117 | 0.118 | 0.108 | |||
| (0.031) | (0.030) | (0.030) | (0.030) | ||||
| is_location_exactTRUE | 0.098 | 0.083 | 0.079 | ||||
| (0.026) | (0.026) | (0.026) | |||||
| neighbourhood_simplifiedB | -0.202 | -0.201 | |||||
| (0.025) | (0.025) | ||||||
| neighbourhood_simplifiedC | -0.216 | -0.206 | |||||
| (0.034) | (0.034) | ||||||
| neighbourhood_simplifiedD | -0.376 | -0.372 | |||||
| (0.095) | (0.095) | ||||||
| cancellation_policymoderate | -0.001 | ||||||
| (0.034) | |||||||
| cancellation_policystrict_14_with_grace_period | 0.094 | ||||||
| (0.029) | |||||||
| cancellation_policysuper_strict_30 | 0.827 | ||||||
| (0.505) | |||||||
| cancellation_policysuper_strict_60 | 0.815 | ||||||
| (0.505) | |||||||
| #observations | 2433 | 2433 | 2417 | 2415 | 2415 | 2415 | 2415 |
| R squared | 0.083 | 0.391 | 0.480 | 0.481 | 0.484 | 0.500 | 0.505 |
| Adj. R Squared | 0.081 | 0.388 | 0.477 | 0.479 | 0.481 | 0.497 | 0.501 |
| Residual SE | 0.678 | 0.553 | 0.511 | 0.512 | 0.510 | 0.503 | 0.501 |
Based on all our models, we feel that model 7 is the model which would be the best predictor as it has the highest adjusted r square of 0.501. The equation for model 7 is the following:
price_4_nights_log = 5.75 + (0.0267 * prop_type_simplifiedCondominium) + (prop_type_simplifiedHouse * -0.0204) + (prop_type_simplifiedOther * 0.271) + (prop_type_simplifiedServiced apartment * 0.614) + (number_of_reviews * -0.00149) + (0.00536 * review_scores_rating) + (-0.753 * room_typeHotel room ) + (-0.602 * room_typePrivate room) + (-1.21 * room_typeShared room) + (bathrooms * -0.0199) + (bedrooms * 0.274) + (0.108 * host_is_superhostTRUE) + (is_location_exactTRUE * 0.0791) + (neighbourhood_simplifiedB * -0.201) + (neighbourhood_simplifiedC * -0.206) + (neighbourhood_simplifiedD * -0.372) (cancellation_policymoderate * -0.00137) + (cancellation_policystrict_14_with_grace_period * 0.0936) + (cancellation_policysuper_strict_30 * 0.827) + (cancellation_policysuper_strict_60 * 0.815)
We will use our model 7 to try and predict the price for 4 nights in Singapore where it is an apartment with a private room, have at least 10 reviews, and an average rating of at least 90. We added an extra variable of location being in neighborhood A, which is the central location in Singapore. Use your best model to predict the total cost to stay at this Airbnb for 4 nights. Include the appropriate 95% interval with your prediction. Report the point prediction and interval in terms of price_4_nights.
predicted_cost <- exp(5.75 + (0.0267 * 0) +
(0 * -0.0204) +
(0 * 0.271) +
(1 * 0.614) +
(10 * -0.00149) +
(0.00536 * 90) +
(-0.753 * 0 ) +
(-0.602 * 1) +
(-1.21 * 0) +
(0 * -0.0199) +
(0 * 0.274) +
(0.108 * 0) +
(0 * 0.0791) +
(0 * -0.201) +
(0 * -0.206) +
(0 * -0.372) +
(0 * -0.00137) +
(0 * 0.0936) +
(0 * 0.827) +
(0 * 0.815))
print(predicted_cost)## [1] 508
SE_residual <- exp(0.501) # Calculate the Standard Error
upper_interval <- predicted_cost + (2*SE_residual)
lower_interval <- predicted_cost - (2*SE_residual)
specify_decimal <- function(x, k) trimws(format(round(x, k), nsmall=k)) #decimal for the intervals
paste("Upper Interval is $",specify_decimal(upper_interval,2))## [1] "Upper Interval is $ 510.80"
## [1] "Lower Interval is $ 504.20"
Based on the model we created, we found that the predicted cost of an Air Bnb in Singapore with the characteristics of it being for 4 nights, in an apartment with a private room, have at least 10 reviews with an average rating of 90 and be located in the central district of Singapore, to be $507.50. Based on our model, we are also 95% confident that the price will be between $504.20 and $510.80.